Simulating an integration profile

Let’s say we want to generate numerical values that satisfy the inequalities corresponding to a given profile. This can be done using the code below.

Simulating mean expression for a given integration profile

source("compute_profile_means.R")
source("compute_minimum_delta.R")
source("setPowerPointStyle.R")
setPowerPointStyle()
## Warning in par(tmag = 1.2): graphical parameter "tmag" is obsolete
PROFCODES = read.table("profile_codes_v2.txt",header = TRUE,sep = "\t") #contains the definitions of all profiles
load("constraints_vector")
prof_index = 3 #index of profile to be simulated (corresponds to an emergent synergy)
ntimes = 5 #n. of simulations
exp_min = 2 #min range of expression value
exp_max = 16 #max range of expression value
min_delta = 1 #minimum non-zero difference among any two comparisons
profile_means = compute_profile_means(PROFCODES, prof_index, ntimes, exp_min, 
                                      exp_max, constraints_vector, min_delta)[ , 1:4]

head(profile_means)
##                                             
## [1,]  3.979122  3.979122  3.979122  6.295081
## [2,] 10.146223 10.146223 10.146223 14.031306
## [3,]  9.992885  9.992885  9.992885 14.449260
## [4,]  8.793068  8.793068  8.793068 14.307622
## [5,]  8.624653  8.624653  8.624653 14.295739

Plotting one example

source("setPowerPointStyle.R")
setPowerPointStyle()

colnames(profile_means)=c("0","X","Y","X+Y")
barplot(profile_means[1,], ylab = 'simulated expression')
add = profile_means[1,1] + (profile_means[1,2] - profile_means[1,1]) + (profile_means[1,3] - profile_means[1,1])
abline(h = add, col="red")

Simulating random samples for a given inegration profile

After computing the means for a given profile, we can generate random samples resembling real data. We assume that real data come from normal distributions centered around the means computed above. The standard deviation is passed as a parameter, so it is possible to simulate arbitrary noise levels.

source("simulate_from_means.R")
source("setPowerPointStyle.R")
setPowerPointStyle()

samples = 4 #number of samples for each condition that will be simulated

noise_level = 0.5 #this means that the signal-to-noise is delta/noise_level = 1/0.5

design = factor(c(rep("0", samples), rep("X", samples), rep("Y", samples), rep("Y+X", samples)))

simulated_values = simulate_from_means(profile_means[1,], prof_index, samples, noise_level, exp_min, exp_max)
names(simulated_values) = design

boxplot(simulated_values ~ design, ylab = 'simulated expression', col = 'gray')

stripchart(simulated_values ~ design, vertical = TRUE, 
    method = "jitter", add = TRUE, pch = 20, col = 'black',cex=1.5)

Example with more noise.

source("setPowerPointStyle.R")
setPowerPointStyle()

noise_level = 1 #this means that the signal-to-noise is delta/noise_level = 1/1

simulated_values = simulate_from_means(profile_means[1,], prof_index, samples, noise_level, exp_min, exp_max)

boxplot(simulated_values ~ design, ylab = 'simulated expression', col = 'gray')

stripchart(simulated_values ~ design, vertical = TRUE, 
    method = "jitter", add = TRUE, pch = 20, col = 'black',cex=1.5)

Loading data

The data file should have the first two columns with annotation (for example probe ID and gene Symbol). Numerical data start from column 3.

data_file = "TNF_IFN_2.csv"
my_data = read.csv(data_file,sep = '\t')
head(my_data)
##   probe  gene     CTRL   CTRL.1    CTRL.2  IFN.2.5 IFN.2.5.1 IFN.2.5.2
## 1  RFC2  RFC2 8.324769 8.324769  8.132764 8.375789  8.373458  8.324769
## 2 HSPA6 HSPA6 5.413300 4.830096  5.929786 8.329249  8.006876  7.950832
## 3  PAX8  PAX8 5.594199 5.581641  5.581641 5.661038  5.581641  5.581641
## 4  UBA7  UBA7 7.361664 7.307654  7.336399 7.336399  7.217897  7.197828
## 5  CCL5  CCL5 9.985708 8.999638 10.371103 4.692704  4.517400  4.996733
## 6 ESRRA ESRRA 6.335024 6.373706  6.539027 6.630504  6.847058  6.778308
##     TNF.2.5 TNF.2.5.1 TNF.2.5.2 TNF.IFN.2.5 TNF.IFN.2.5.1 TNF.IFN.2.5.2
## 1  8.324769  8.462647  8.386247    8.324769      8.338911      8.192397
## 2  6.248139  6.207699  5.940508    7.677756      7.484339      7.540568
## 3  5.581641  5.581641  5.581641    5.504055      5.411538      5.545741
## 4  7.671212  7.607462  7.470790    7.420842      7.336399      7.095814
## 5 12.476331 12.450789 12.326590    6.663396      5.642317      6.052862
## 6  6.155354  6.370346  6.397341    6.975653      6.886920      6.539027
#get numeric data
expression_data = my_data[,-(1:2)]

Preprocessing data

if (max(expression_data)>25) expression_data = log2(expression_data)

#removing uninformative probes (very small coefficient of variation)
cof_cutoff = 0.05

cof = apply(expression_data, 1, function(x) sd(x)/mean(x))

cof_filter = which(cof > cof_cutoff)

Checking data quality with PCA

#graphical parameters
source("setPowerPointStyle.R")
setPowerPointStyle()

my.pca <- prcomp(t(expression_data[cof_filter, ]), center = TRUE, scale=TRUE)

#we assume the same number of samples for each condition
samples = ncol(expression_data)/4

cols = c(rep("black", samples), rep("red", samples),
         rep("blue", samples), rep("yellow", samples))

plot(my.pca$x[, 1], my.pca$x[, 2], col = cols,
     xlab = "PC1", ylab = "PC2", pch = 20, cex = 1.5, main = data_file)

legend("bottomleft", pch = 20, col = unique(cols), 
       legend = c("0","X","Y","X+Y"), bty = 'n',cex = 1)

Filtering based on minimum group differences

source("filter_data_on_deltas.R")
design = factor(c(rep("0",samples),rep("X",samples),
                  rep("Y",samples),rep("Y+X",samples)))


my_data_filtered = filter_data_on_deltas(my_data, design = design)
 
head(my_data_filtered)
##      probe    gene     CTRL   CTRL.1    CTRL.2  IFN.2.5 IFN.2.5.1
## 2    HSPA6   HSPA6 5.413300 4.830096  5.929786 8.329249  8.006876
## 5     CCL5    CCL5 9.985708 8.999638 10.371103 4.692704  4.517400
## 10     PXK     PXK 9.294279 9.508841  9.240162 9.846890 10.014370
## 11 MSANTD3 MSANTD3 9.590901 9.274737  9.831871 7.027361  7.053774
## 12 SLC46A1 SLC46A1 6.048298 6.617517  6.384521 7.951326  7.867392
## 14    PIGX    PIGX 9.020155 8.955406  9.036642 8.261126  8.799131
##    IFN.2.5.2   TNF.2.5 TNF.2.5.1 TNF.2.5.2 TNF.IFN.2.5 TNF.IFN.2.5.1
## 2   7.950832  6.248139  6.207699  5.940508    7.677756      7.484339
## 5   4.996733 12.476331 12.450789 12.326590    6.663396      5.642317
## 10 10.093442  8.939042  9.193210  8.997016    9.972881     10.101698
## 11  7.320024  8.754430  8.966617  8.826732    7.349552      7.153825
## 12  7.562287  6.122274  6.428453  6.443509    8.016323      7.685079
## 14  8.650005  8.832036  8.905883  8.848334    9.028034      9.129482
##    TNF.IFN.2.5.2
## 2       7.540568
## 5       6.052862
## 10     10.142173
## 11      7.248851
## 12      7.787337
## 14      8.781471

Random forest classifier

source("find_optimal_match.R")

my_data_filtered_matched = find_optimal_match(my_data_filtered)

head(my_data_filtered_matched)
##      probe    gene     CTRL   CTRL.1    CTRL.2  IFN.2.5 IFN.2.5.1
## 2    HSPA6   HSPA6 5.413300 4.830096  5.929786 8.329249  8.006876
## 5     CCL5    CCL5 9.985708 8.999638 10.371103 4.692704  4.517400
## 10     PXK     PXK 9.294279 9.508841  9.240162 9.846890 10.014370
## 11 MSANTD3 MSANTD3 9.590901 9.274737  9.831871 7.027361  7.053774
## 12 SLC46A1 SLC46A1 6.048298 6.617517  6.384521 7.951326  7.867392
## 14    PIGX    PIGX 9.020155 8.955406  9.036642 8.261126  8.799131
##    IFN.2.5.2   TNF.2.5 TNF.2.5.1 TNF.2.5.2 TNF.IFN.2.5 TNF.IFN.2.5.1
## 2   7.950832  6.248139  6.207699  5.940508    7.677756      7.484339
## 5   4.996733 12.476331 12.450789 12.326590    6.663396      5.642317
## 10 10.093442  8.939042  9.193210  8.997016    9.972881     10.101698
## 11  7.320024  8.754430  8.966617  8.826732    7.349552      7.153825
## 12  7.562287  6.122274  6.428453  6.443509    8.016323      7.685079
## 14  8.650005  8.832036  8.905883  8.848334    9.028034      9.129482
##    TNF.IFN.2.5.2 score prof_index
## 2       7.540568 0.290         52
## 5       6.052862 0.536        108
## 10     10.142173 0.426         86
## 11      7.248851 0.500         68
## 12      7.787337 0.660         21
## 14      8.781471 0.598          2

Visualizing all integration profiles with frequency plots

source("visualize_all_profiles.R")
source("setPowerPointStyle.R")
setPowerPointStyle()

visualize_all_profiles(my_data_filtered_matched)

Appendix: plotting all profiles

source("generate_all_profiles.R")
generate_all_profiles()